Stops

Downloads + Imports

%run "setup.ipynb"
CPU times: user 263 ms, sys: 174 ms, total: 438 ms
Wall time: 3.94 s
Loading BokehJS ...

Read and format data

# Load the GTFS stops table directly from the feed archive and peek at it.
# NOTE(review): `zipfile` here is presumably a ZipFile instance opened in
# setup.ipynb (it shadows the stdlib module name) — confirm there.
%time stops_df = pd.read_csv(zipfile.open('stops.txt'))
stops_df.tail()
stops_df.info()
CPU times: user 110 ms, sys: 11.9 ms, total: 122 ms
Wall time: 122 ms
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41914 entries, 0 to 41913
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   stop_id              41914 non-null  object 
 1   stop_code            0 non-null      float64
 2   stop_name            41914 non-null  object 
 3   stop_desc            0 non-null      float64
 4   stop_lat             41914 non-null  float64
 5   stop_lon             41914 non-null  float64
 6   location_type        41914 non-null  int64  
 7   parent_station       28752 non-null  float64
 8   wheelchair_boarding  9101 non-null   float64
 9   platform_code        4436 non-null   object 
 10  zone_id              15369 non-null  object 
dtypes: float64(6), int64(1), object(4)
memory usage: 3.5+ MB
# Replace NaNs with empty strings, drop the two all-null columns, and
# normalize wheelchair_boarding so missing entries read as 0.
stops_df = stops_df.fillna('')
stops_df = stops_df.drop(columns=['stop_code', 'stop_desc'])
missing_wheelchair = stops_df['wheelchair_boarding'] == ''
stops_df.loc[missing_wheelchair, 'wheelchair_boarding'] = 0

# Keep the full table (one row per stop point / platform) before reducing
# to a single representative row per named stop.
stops_df_multiple_stops = stops_df.copy()
stops_df = stops_df.drop_duplicates(
    subset=['stop_name', 'location_type', 'wheelchair_boarding', 'platform_code'],
    keep='first',
)
stops_df.head()
stop_id stop_name stop_lat stop_lon location_type parent_station wheelchair_boarding platform_code zone_id
0 000008012713 Rangsdorf, Bahnhof 52.294125 13.431112 0 900000245025.0 0
1 000008010205 Leipzig, Hauptbahnhof 51.344817 12.381321 0 900000550090.0 0
2 000008010327 Senftenberg, Bahnhof 51.526790 14.003977 0 900000435000.0 0
3 000008010324 Schwerin, Hauptbahnhof 53.635261 11.407520 0 900000550112.0 0
4 000008012393 Mühlanger, Bahnhof 51.855704 12.748198 0 900000550319.0 0
stops_df.apply(lambda x: x.unique().size, axis=0)
stop_id                29601
stop_name              13155
stop_lat               13107
stop_lon               13119
location_type              2
parent_station         13121
wheelchair_boarding        2
platform_code             59
zone_id                14622
dtype: int64
# visualization with folium: takes much longer and uses more memory than bokeh

#f = folium.Figure(width=800, height=600)
#m = folium.Map(location=[45.5236, -122.6750], prefer_canvas=True).add_to(f)
#for lat, lon in zip(stops_df['stop_lat'], stops_df['stop_lon']):
#    folium.CircleMarker(
#        location=[lat, lon],
#        radius=1,
#        color="#3186cc",
#        fill=True,
#        fill_color="#3186cc",
#    ).add_to(m)
#m
def merc_from_arrays(lats, lons):
    """Convert WGS84 latitude/longitude arrays to Web Mercator (EPSG:3857).

    Parameters
    ----------
    lats, lons : array-like of float
        Latitudes and longitudes in degrees.

    Returns
    -------
    tuple of numpy arrays
        (x, y) coordinates in meters.
    """
    r_major = 6378137.000  # WGS84 semi-major axis in meters
    x = r_major * np.radians(lons)
    # The original computed scale = x / lons, which is elementwise the
    # constant r_major * pi / 180 but produced NaN (0/0) for lons == 0.
    # Folding the constant in gives the standard forward formula
    # y = R * ln(tan(pi/4 + lat/2)) and handles longitude 0 cleanly.
    y = r_major * np.log(np.tan(np.pi / 4.0 + np.radians(lats) / 2.0))
    return (x, y)
# Interactive Bokeh map of all stops over an OpenStreetMap tile layer.
# Axis ranges are Web Mercator meters and roughly frame the VBB area.
hover_fields = [
    ("Name", "@stop_name"),
    ("platform", "@platform_code"),
    ("(Lat, Lon)", "(@stop_lat, @stop_lon)"),
]
p = figure(
    plot_width=800,
    plot_height=700,
    title="Public Transport Stops of VBB",
    tools="pan,wheel_zoom",
    x_range=(1215654.4978, 1721973.3732),
    y_range=(6533225.6816, 7296372.9720),
    x_axis_type="mercator",
    y_axis_type="mercator",
    tooltips=hover_fields,
)
p.add_tile(get_provider(OSM))

# Project the stop coordinates once and keep them as extra columns so the
# hover tool can still show the original lat/lon.
stops_df['merc_x'], stops_df['merc_y'] = merc_from_arrays(stops_df['stop_lat'], stops_df['stop_lon'])
p.circle(x='merc_x', y='merc_y', source=stops_df)
show(p)
# Same data via HoloViews + Datashader: all stops (left) next to the
# wheelchair-accessible subset (right), over semi-transparent OSM tiles.
hv.output(backend="bokeh")
tiles = hv.element.tiles.OSM().opts(alpha=0.5)

accessible_df = stops_df.loc[stops_df['wheelchair_boarding'] == 1]
stops = hv.Points(stops_df, ['merc_x', 'merc_y'], label='Public Transport Stops')
stops_wa = hv.Points(accessible_df, ['merc_x', 'merc_y'], label='Wheelchair accessible Stops')

tiles * hd.datashade(stops) + tiles * hd.datashade(stops_wa)

Stations with most stops

stops_df_multiple_stops['stop_name'].value_counts().head(10)
S Potsdam Hauptbahnhof                  26
Potsdam, Medienstadt Babelsberg Bhf     19
Cottbus, Hauptbahnhof                   19
S Königs Wusterhausen Bhf               19
S Wannsee Bhf (Berlin)                  18
Fürstenwalde, Bahnhof                   18
S+U Berlin Hauptbahnhof                 18
S Ostkreuz Bhf (Berlin)                 17
Potsdam, Johannes-Kepler-Platz          17
S+U Zoologischer Garten Bhf (Berlin)    17
Name: stop_name, dtype: int64
# Stop points per station name, restricted to genuine multi-stop locations
# (single-stop names excluded), largest first.
counts_per_name = stops_df_multiple_stops.groupby('stop_name').agg(num_stops=('stop_id', 'count'))
num_stops = counts_per_name.query('num_stops > 1').sort_values('num_stops', ascending=False)
num_stops.describe()
num_stops
count 13120.000000
mean 3.191997
std 1.318984
min 2.000000
25% 3.000000
50% 3.000000
75% 3.000000
max 26.000000
num_stops_mean = num_stops['num_stops'].mean()
num_stops_median = num_stops['num_stops'].median()

# Histogram of stops-per-location with mean/median reference lines.
fig, ax = plt.subplots()
sns.histplot(x='num_stops', data=num_stops, color=sns_c[3], ax=ax, discrete=True)
for value, color_idx, label in [
    (num_stops_mean, 1, f'mean = {num_stops_mean: ,.2f}'),
    (num_stops_median, 4, f'median = {num_stops_median}'),
]:
    ax.axvline(x=value, color=sns_c[color_idx], linestyle='--', label=label)
ax.legend(loc='upper right')
ax.set(title='Number of Stops per Location', xlabel='number of stops', xlim=(0, None))
[Text(0.5, 1.0, 'Number of Stops per Location'),
 Text(0.5, 0, 'number of stops'),
 (0.0, 27.75)]
_images/stops_14_1.png

Stops per District

from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen, Request, urlretrieve
from collections import OrderedDict

# PLZ (postal code) polygons for all of Germany. The host rejects the
# default urllib user agent, hence the browser-like header.
url = "https://www.suche-postleitzahl.org/download_files/public/plz-gebiete.shp.zip"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'
}

data = urlopen(Request(url, None, headers)).read()

# Read the shapefile straight from the in-memory zip (fiona's
# ZipMemoryFile, presumably imported in setup.ipynb — confirm there).
with ZipMemoryFile(data) as zip_memory_file:
    with zip_memory_file.open('plz-gebiete.shp') as collection:
        # collection.crs is {'init': dst_epsg} -> deprecated format
        plz_shapes = gpd.GeoDataFrame.from_features(collection, crs=collection.crs['init'])
# plz_shapes = gpd.read_file('plz_gebiete.zip')
plz_shapes.head()
geometry plz note
0 POLYGON ((5.86632 51.05110, 5.86692 51.05124, ... 52538 52538 Gangelt, Selfkant
1 POLYGON ((5.94504 51.82354, 5.94580 51.82409, ... 47559 47559 Kranenburg
2 POLYGON ((5.96811 51.05556, 5.96951 51.05660, ... 52525 52525 Waldfeucht, Heinsberg
3 POLYGON ((5.97486 50.79804, 5.97495 50.79809, ... 52074 52074 Aachen
4 POLYGON ((6.01507 50.94788, 6.03854 50.93561, ... 52531 52531 Übach-Palenberg
# Population per postal code (PLZ); plz kept as string to match
# plz_shapes['plz'].
# NOTE(review): this cell raised FileNotFoundError in the captured run —
# 'plz_einwohner.csv' must be present (downloaded from the same source as
# the shapes) before this and all following cells can succeed.
plz_population = pd.read_csv('plz_einwohner.csv', dtype={'plz': str, 'einwohner': int})
plz_population.head()
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
/tmp/ipykernel_2135/379100333.py in <module>
----> 1 plz_population = pd.read_csv('plz_einwohner.csv', dtype={'plz': str, 'einwohner': int})
      2 plz_population.head()

/opt/hostedtoolcache/Python/3.8.10/x64/lib/python3.8/site-packages/pandas/io/parsers.py in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
    608     kwds.update(kwds_defaults)
    609 
--> 610     return _read(filepath_or_buffer, kwds)
    611 
    612 

/opt/hostedtoolcache/Python/3.8.10/x64/lib/python3.8/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    460 
    461     # Create the parser.
--> 462     parser = TextFileReader(filepath_or_buffer, **kwds)
    463 
    464     if chunksize or iterator:

/opt/hostedtoolcache/Python/3.8.10/x64/lib/python3.8/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    817             self.options["has_index_names"] = kwds["has_index_names"]
    818 
--> 819         self._engine = self._make_engine(self.engine)
    820 
    821     def close(self):

/opt/hostedtoolcache/Python/3.8.10/x64/lib/python3.8/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
   1048             )
   1049         # error: Too many arguments for "ParserBase"
-> 1050         return mapping[engine](self.f, **self.options)  # type: ignore[call-arg]
   1051 
   1052     def _failover_to_python(self):

/opt/hostedtoolcache/Python/3.8.10/x64/lib/python3.8/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
   1865 
   1866         # open handles
-> 1867         self._open_handles(src, kwds)
   1868         assert self.handles is not None
   1869         for key in ("storage_options", "encoding", "memory_map", "compression"):

/opt/hostedtoolcache/Python/3.8.10/x64/lib/python3.8/site-packages/pandas/io/parsers.py in _open_handles(self, src, kwds)
   1360         Let the readers open IOHanldes after they are done with their potential raises.
   1361         """
-> 1362         self.handles = get_handle(
   1363             src,
   1364             "r",

/opt/hostedtoolcache/Python/3.8.10/x64/lib/python3.8/site-packages/pandas/io/common.py in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    640                 errors = "replace"
    641             # Encoding
--> 642             handle = open(
    643                 handle,
    644                 ioargs.mode,

FileNotFoundError: [Errno 2] No such file or directory: 'plz_einwohner.csv'
# Attach population to each PLZ polygon; the left join keeps PLZs without
# population data (their 'einwohner' becomes NaN).
plz_df = pd.merge(left=plz_shapes[['plz', 'geometry']], right=plz_population, on='plz', how='left')
plz_df.nlargest(5, 'einwohner')
# Build point geometries for the stops; shapely points take (x=lon, y=lat).
stops_gdf = gpd.GeoDataFrame(stops_df, geometry=gpd.points_from_xy(stops_df['stop_lon'], stops_df['stop_lat']))
stops_gdf.set_crs(epsg=4326, inplace=True)
stops_gdf.head()
# Spatial join: keep only stops whose point lies within some PLZ polygon.
# NOTE(review): the `op=` keyword is deprecated in newer geopandas
# (renamed to `predicate=`) — update when the environment is upgraded.
join_df = gpd.sjoin(stops_gdf, plz_df, how='inner', op="within")
join_df.drop(['index_right', 'einwohner'], axis=1, inplace=True)
# Keep only rows without a parent_station ('' after the earlier fillna),
# i.e. top-level stops, so stations are not over-counted per PLZ.
join_df = join_df[join_df['parent_station'] == ""]
join_df.head()
# Count stops per PLZ and merge the counts back onto the polygons; PLZs
# without any stop get NaN stop_count.
count_df = join_df.groupby('plz', dropna=False).size().reset_index(name='stop_count')
plz_df = pd.merge(left=plz_df[['plz', 'geometry', 'einwohner']], right=count_df, on='plz', how='left')
# Guard against division by zero; NaN einwohner/stop_count propagate to NaN.
plz_df['stops_per_inhabitant'] = plz_df.apply(lambda row: np.nan if row['einwohner'] == 0 else row['stop_count'] / row['einwohner'], axis=1)
plz_df.sort_values('stop_count', ascending=False)
def plot_plz_choropleth(gdf, column, title, linewidth=0.1, legend=True):
    """Draw a choropleth of `column` over the PLZ polygons.

    Parameters
    ----------
    gdf : GeoDataFrame
        Must have a 'geometry' column plus `column`.
    column : str
        Column to color polygons by (continuous colormap).
    title : str
        Axes title.
    linewidth : float, optional
        Polygon edge width.
    legend : bool, optional
        Whether to draw the colorbar.

    Returns
    -------
    (Figure, Axes)
    """
    fig, ax = plt.subplots(figsize=(5, 5))
    gdf.plot(
        ax=ax,
        column=column,
        categorical=False,
        cmap='plasma_r',
        edgecolor='black',
        linewidth=linewidth,
        legend=legend,
    )
    # aspect > 1 compensates for longitude compression at German latitudes
    # so the unprojected map is not visibly squashed.
    ax.set(title=title, aspect=1.3)
    return fig, ax

# The three maps differed only in column, title, edge width, and legend,
# so one parameterized helper replaces three copy-pasted cells.
plot_plz_choropleth(plz_df, 'einwohner', 'Population per PLZ', linewidth=0.05);
plot_plz_choropleth(plz_df, 'stop_count', 'Stops per PLZ');
plot_plz_choropleth(plz_df, 'stops_per_inhabitant', 'Stops per Inhabitant per PLZ', legend=False);